#!/usr/bin/env python2.3
#
#  Validation Test.py
#
#  This program takes in an expression data file
#  and a BPM network and outputs the result of 
#  the validation test described in "Evaluating
#  Between-Pathway Models with Expression Data",
#  published in RECOMB 2009
#
#  Created by Max Leiserson on 7/30/08.
#


import time
import sys
import string
import random
import os


def readBPMs (response):
	
	FILE = open (response, 'r')
	line = 'not null'
	partition1 = []
	partition2 = []
	i = 0
	
	while line != '':
	
		line = FILE.readline()
		line = line.rstrip()
		array = line.split ("\t")

		if ((i %2) == 0):
			partition1.append (array)

		else:
			partition2.append (array)
		
		i = i + 1


	total = []
	
	total.append (partition1)
	total.append (partition2)
	response = response.strip('BPM Networks/')
	response = response.rstrip('.tx')
	if response.count('KI ') > 0:
		currentTime = 'Output/KI Pruned'  + '/' + time.strftime("%d%b%Y", time.localtime()) +' ' + time.strftime("%H_%M_%S", time.localtime())
		os.mkdir (currentTime)
		os.chdir (currentTime)
	else:	
		currentTime = 'Output/' + response + '/' + time.strftime("%d%b%Y", time.localtime()) +' ' + time.strftime("%H_%M_%S", time.localtime())
		os.mkdir (currentTime)
		os.chdir (currentTime)
	print 'Dataset:', response
	

	return total

# retrieve all the KO data from the hughes dataset
def getKnockouts(expression_data):
	
		#open file
		geneList = []
		knockoutInfo =[]
		prunedList = [] #all the genes in the hughes file
		#fix later - read until file ends
		for i in range(0, 4):
				geneList.append(expression_data.readline())
		
		#print geneList[2]
		
		tempGenes = geneList[2].split('\t')
		del tempGenes[0:4]
		hughGeneList=[]
		indices = []
		
		#put gene column titles *top row* stored at 0
		for i in range(0, len(tempGenes)):
			if(i%4 == 0):
				genes = []
				if (tempGenes[i].count(' (') > 0):
					arr = tempGenes[i].split(' (')
					gene = arr[0].upper()
				else:
					gene = tempGenes[i].upper()
				if (gene.count(',')):
					arr2 = gene.split(',')
					if (arr2[0].count('"') > 0):
						arr3 = arr2[0].split('"')
						genes.append(arr3[1].upper())
					else:
						genes.append(arr2[0])
					if (arr2[1].count('"') > 0):
						arr4 = arr2[1].split('"')
						genes.append(arr4[0].upper())
					else:
						genes.append(arr2[1].upper())
				else:
					genes.append(gene)
				hughGeneList.append(genes)
				prunedList.append(genes[0])
				indices.append(i/4)
				if (len(genes) > 1):
					prunedList.append(genes[1])
					indices.append(i)
		
		
		# put a list of all the genes tested in hughes in index 0
		knockoutInfo.append(prunedList)
		next = geneList[3].split('\t')
		del next[0:2]
		stats = next[0:4]
		#put stat titles in knockout info at index 1
		knockoutInfo.append(stats)
		
		
		#put knockout gene *first column* in knockout info in index 2
		knockouts = []
		line = expression_data.readline()
		while line != '':
				data = line.split('\t')
				knockouts.append(data[0].upper())
				line = expression_data.readline()

		knockoutInfo.append(knockouts)
		knockoutInfo.append(hughGeneList)
		knockoutInfo.append(indices)
		
		return knockoutInfo

def readRegistryGenenames():

#opens the file of genenames and common names, and creates a dictionary where one can find the common name by entering the genename

	openFile = open ('../../../registry.genenames.tab', 'r')
	lines = []
	genenames = {}
	
	for line in openFile:
		lines.append (line)

	for item in lines:
		anArray = item.rsplit ('\t')
		genenames[anArray[-2]] = anArray[0] # in the file, the first item is the common name and the second to last item is the genename

	return genenames

def ourGeneList(bpmList):
	
	ourList = []
	ourListY = []
	ourListCommon = []
	
	for i in range(0, len(bpmList[0])):
		for j in range(0, len(bpmList[0][i])):
			if(ourListY.count(bpmList[0][i][j]) == 0):
				ourListY.append(bpmList[0][i][j])
				
	for i in range(0, len(bpmList[1])):
		for j in range(0, len(bpmList[1][i])):
			if(ourListY.count(bpmList[1][i][j]) == 0):
				ourListY.append(bpmList[1][i][j])

	for i in range(0, len(bpmList[2])):
		for j in range(0, len(bpmList[2][i])):
			if(ourListCommon.count(bpmList[2][i][j]) == 0):
				ourListCommon.append(bpmList[2][i][j])
				
	for i in range(0, len(bpmList[3])):
		for j in range(0, len(bpmList[3][i])):
			if(ourListCommon.count(bpmList[3][i][j]) == 0):
				ourListCommon.append(bpmList[3][i][j])
	
	ourList.append(ourListY)    
	ourList.append(ourListCommon)                                               

	return ourList

def theirListLocations(ourList, results):
	
	theirList = []
	theirListY = []
	theirListCommon = []
		
	for i in range(0, len(ourList[0])):
		if(results.count(ourList[0][i]) != 0):
			location = results.index(ourList[0][i])
			theirListY.append(location)
		else:
			theirListY.append(-1)
	
	for i in range(0, len(ourList[1])):
		if(results.count(ourList[1][i]) != 0):
			location = results.index(ourList[1][i])
			theirListCommon.append(location)
		else:
			theirListCommon.append(-1)
	
	theirList.append(theirListY)
	theirList.append(theirListCommon)

	return theirList

def getHughesColumn(expression_data, offset):
	
		expression_data.seek(0,0)
		returnData = []

		for k in range (0, 4):
				junk = expression_data.readline()
		line = expression_data.readline()
		while line != '':
				temp = []
				data = line.split('\t')
				
				temp.append(data[0])
				temp.append(data[offset*4+2])
				temp.append(data[offset*4+3])
				temp.append(data[offset*4+4])
				temp.append(data[offset*4+5])
				returnData.append(temp)
				line = expression_data.readline()
		
		return returnData

def getSortedMicroArray(inputMicroArray):
	
		micro = []
		for j in range(0, len(inputMicroArray)):
				temp = []
				if inputMicroArray[j][2] != '   NaN':
						temp.append(float(inputMicroArray[j][2])) #convert the third column to a float, take its absolute value, and append it to temp
						temp.append(inputMicroArray[j][0])
						micro.append(temp)
		
		
		micro.sort() #sort the array
		micro.reverse() #reverse it, so it is in the correct order
		
		return micro

#comments
# getSortedArray does the same thing as getSortedMicroArray, except it doesn't include the names in the array
def getSortedArray (inputMicroArray):
	
		micro = []
		for j in range(0, len(inputMicroArray)):
				if inputMicroArray[j][2] != '   NaN':
						micro.append(float(inputMicroArray[j][2]))
		
		
		micro.sort()
		micro.reverse()

		return micro

def getSortedMicroArrayAbs(inputMicroArray):
	
		micro = []
		for j in range(0, len(inputMicroArray)):
				temp = []
				if inputMicroArray[j][2] != '   NaN':
						temp.append(abs(float(inputMicroArray[j][2]))) #convert the third column to a float, take its absolute value, and append it to temp
						temp.append(inputMicroArray[j][0])
						micro.append(temp)
		
		
		micro.sort() #sort the array
		micro.reverse() #reverse it, so it is in the correct order
		
		return micro

#comments
# getSortedArray does the same thing as getSortedMicroArray, except it doesn't include the names in the array
def getSortedArrayAbs (inputMicroArray):
	
		micro = []
		for j in range(0, len(inputMicroArray)):
				if inputMicroArray[j][2] != '   NaN':
						micro.append(abs(float(inputMicroArray[j][2])))
		
		
		micro.sort()
		micro.reverse()

		return micro

#comments
#inputArray is a sorted micro array
#geneset is the overall gene set to test
#index is the index up to which N_r will be calculated
def calculateN_r(inputArray, geneList):
	
		N_r = 0
		for i in range(0,len(inputArray)):
				for j in range(0, len(geneList)):
						#print '---'
						#print 'inputArray ', inputArray[i][1]
						#print '--'
						#print 'geneList ', geneList[j]
						if inputArray[i][1] == geneList[j]:
								N_r += abs(inputArray[i][0])
		
		if N_r == 0:
				print 'ERROR with N_r count'
				return -1
		else:
				return float(N_r)

#comments
# add up the absolute values of the log10 values for an inputArray
def calculateN_r2 (inputArray):
	
		total = 0.0
		
		for i in range (len (inputArray)):
			total += abs(inputArray[i][0])
		
		return total

def generanks (sorted, genelist):
	
		array = []
		n = float(len(sorted))
		sortedNames = []

		sorted.reverse()
				
		for i in range (len (sorted)):

			sortedNames.append (sorted[i][1])

		for item in genelist:

				temp = []
				temp.append (((((n/2) - float(sortedNames.index (item))) / (n/2))))
				temp.append (float(sortedNames.index (item)))
				temp.append(item)
				array.append (temp)       

		array.sort()
		array.reverse()

		return array

def generanksAbs (sorted, genelist):
	
	   array = []

	   sorted.reverse()
	   sortedNames = []
   
	   
	   newSorted = []
	   numberBefore = 0
	   numberDistinct = 0
	   currentDupes = 0
	   currentValue = sorted[0][0]

	   for z in range(len(sorted)):
		   
		   temp = []
		   if(sorted[z][0] != currentValue):
			   numberBefore +=currentDupes
			   currentDupes = 0
			   numberDistinct +=1
			   temp.append(numberBefore)
			   temp.append(sorted[z][0])
			   temp.append(sorted[z][1])
			   newSorted.append(temp)
			   numberBefore +=1
		   else:
			   currentDupes +=1
			   temp.append(numberBefore)
			   temp.append(sorted[z][0])
			   temp.append(sorted[z][1])
			   newSorted.append(temp)
		   currentValue = sorted[z][0]


	   finalSort = []
	   rankWeight = len(sorted)
	   currentDupes = 0
	   currentValue = sorted[0][0]

	   for i in range(len(sorted)):
		   
		   temp = []
		   if(sorted[i][0] != currentValue):
			   rankWeight -= currentDupes
			   currentDupes = 0
			   temp.append(rankWeight)
			   temp.append(newSorted[i][0])
			   temp.append(newSorted[i][1])
			   temp.append(newSorted[i][2])
			   finalSort.append(temp)
			   rankWeight -=1
		   else:
			   currentDupes += 1
			   temp.append(rankWeight)
			   temp.append(newSorted[i][0])
			   temp.append(newSorted[i][1])
			   temp.append(newSorted[i][2])
			   finalSort.append(temp)
		   currentValue = sorted[i][0]


	   returnSet = []
	   for k in range (len(finalSort)):
		   if(counter(genelist, finalSort[k][3]) != 0):
			   returnSet.append(finalSort[k])


	   return returnSet

def counter(list, item):
	
		count = 0
		for i in range(0, len(list)):
				if (list[i] == item):
						count +=1
		return count
		
def gsea(inputMicroArray, geneList, gseaType):
	
		#sort inputMicroArray by log10 score
		sorted = getSortedMicroArray(inputMicroArray)
				
				#switch it to smallest to biggest
				#----optimize later

		sorted.reverse()

		geneRanks = generanksAbs(sorted, geneList)
				
		maxDiff = 0
		
		answer = []
		tempAnswer = 0
		runningHit = 0
		temp = 0
		
				
		nr = calculateN_r2(geneRanks, gseaType)

		N = float(len(inputMicroArray))

		Nh = float(len(geneRanks))
		
				
		for i in range(0, len(geneRanks)):
				
						#test right before we see a new element -
						#misses have reached their max
						if (N == 0):
							N = 1
						if (nr == 0):
							nr = 1
						try:
							temp = (runningHit/nr) - ((geneRanks[i][1])/N)
						except ZeroDivisionError:
							temp = 0
							
						if (abs(temp) > maxDiff):
								maxDiff = abs(temp)
								tempAnswer = temp
				
						#add the element we are on
						runningHit += abs(geneRanks[i][2])

						temp = (runningHit/nr) - ((geneRanks[i][1])/N)   
						
						if (abs(temp) > maxDiff):
								maxDiff = abs(temp)
								tempAnswer = temp

										
		answer.append(tempAnswer)

		return answer


def gsea2(inputMicroArray, geneList):
	
		#sort inputMicroArray by log10 score
		sorted = getSortedMicroArray(inputMicroArray)
				
				#switch it to smallest to biggest
				#----optimize later
		sorted.reverse()

		geneRanks = generanks(sorted, geneList)
				
		maxDiff = 0
		
		answer = []
		tempAnswer = 0
		runningHit = 0
		
				
		nr = calculateN_r2(geneRanks, '2')

		N = float(len(inputMicroArray))

		Nh = float(len(geneRanks))

		
		for i in range(0, len(geneRanks)):
				
						#test right before we see a new element -
						#misses have reached their max
						temp = (runningHit/nr) - ((geneRanks[i][1] - i -1)/N)  
						if (abs(temp) > maxDiff):
								maxDiff = abs(temp)
								tempAnswer = temp
				
						#add the element we are on
						runningHit += abs(geneRanks[i][0])

						temp = (runningHit/nr) - ((geneRanks[i][1] - i)/N)   
						
						if (abs(temp) > maxDiff):
								maxDiff = abs(temp)
								tempAnswer = temp

										
		answer.append(tempAnswer)

		return answer

def compute_rank_cluster_score(inputMicroArray, geneList):
	
		#sort inputMicroArray by log10 score
		sorted = getSortedMicroArrayAbs(inputMicroArray)
				
				#switch it to smallest to biggest
				#----optimize later

		sorted.reverse()
		geneRanks = generanksAbs(sorted, geneList)

		maxDiff = 0
		
		answer = []
		tempAnswer = 0
		runningHit = 0
		temp = 0
		
				
		nr = calculateN_r2(geneRanks)

		N = float(len(sorted)) # sorted does not include the NaN values, which should be ignored when calculating misses

		Nh = float(len(geneRanks))
						
		for i in range(0, len(geneRanks)):
				
						#test right before we see a new element -
						#misses have reached their max
						temp = (runningHit/nr) - ((geneRanks[i][1])/N)  
						if (abs(temp) > maxDiff):
								maxDiff = abs(temp)
								tempAnswer = temp
				
						#add the element we are on
						runningHit += abs(geneRanks[i][0])

						temp = (runningHit/nr) - ((geneRanks[i][1])/N)   
						
						if (abs(temp) > maxDiff):
								maxDiff = abs(temp)
								tempAnswer = temp

										
		answer.append(tempAnswer)
		
		return answer

def get_random_geneset_data(inputMicroArray, geneList):
	
	random.seed()
	pRaw = []
	
	
	#because we want no NaN reuse sort - but don't reuse it can't be processed twice
	#in v2.0 write a clean function
    
	sorted = getSortedMicroArrayAbs (inputMicroArray)
	
	for i in range(0, 99):
			# get a random set of genes 
			randomSet = []
			while len(randomSet) < len(geneList):
					candidate = random.randint(0, len(sorted)-1)
					count = 0
					for j in range(0, len(geneList)):
							if (sorted[candidate][1] == geneList[j]):
									count +=1
					if count == 0:
							#print 'adding ', sorted[candidate][1], ' to list'
							randomSet.append(sorted[candidate][1])
    
			pRaw.append (compute_rank_cluster_score(inputMicroArray, randomSet))

	return pRaw

def get_random_knockout_data(expression_data, knockOutPathway, knockOutPathwayY, EsPathway, EsPathwayY, knockOutList):

	#get random set of test - none of which are in knockout pathway
	#calculate the opposite pathway's es value
	random_CRS_scores = []

	for i in range(0, 99):

		found = 0
		rIndex = 0
		random.seed()
		while found == 0:

			count = 0
			rIndex = random.randint(0,len(knockOutList)-1)
			for a in range(len(knockOutList[rIndex])):
				count += counter(knockOutPathway, knockOutList[rIndex][a]) 
				count += counter(knockOutPathwayY, knockOutList[rIndex][a]) 
				count += counter(EsPathway, knockOutList[rIndex][a])
				count += counter(EsPathwayY, knockOutList[rIndex][a])
				if count == 0:
					found = 1
					break
			#print 'attempting index is', rIndex

		random_hughes_column = getHughesColumn(expression_data, rIndex)
		random_CRS_scores.append (compute_rank_cluster_score(random_hughes_column, EsPathwayY))
	
	return random_CRS_scores


def calculate_p_value(CRS, random_CRS_scores):

	# once all the random values have been appended to random_CRS_scores,
	# sort random_CRS_scores so as to make finding the CRS's rank in random_CRS_scores possible
	random_CRS_scores.sort()
	p = 0.0
    
	# for each item in random_CRS_scores, if p is greater than the CRS, incrememnt p by 1
	for a in range (0, len(random_CRS_scores)):
		if (CRS > random_CRS_scores[a]):
			p += 1.0
    
	# add an extra 1 to p, to account for the fact that counting in computers starts at 0
	p += 1.0
	
	# divide get_random_geneset_data by 100, effectively making it a p value
	p = p/100.0
    
	return 1 - p
	
def collect_bpms_that_meet_criteria(suppsInfo, knockoutInfo):
	
	outputFileString = 'All BPMs.html' # create a file in the directory that was previously created by the program that stores the list of BPMs
	outputFile = open(outputFileString, 'w')           # that meet the required specifications
	outputFile.write ('<head>\n<title> All BPMs.html </title>\n</head><body><p><FONT face=\"Verdana\" size=\"1\">')
	highlight = 0
	genenames = readRegistryGenenames()
	pathway1 = []
	pathway2 = []
	print 'Total BPMs: ', len(suppsInfo[1])

	for i in range (len(suppsInfo[1])): # for every BPM read in either from the user or from file (suppsInfo[1] is the same length as suppsInfo[0])

		temp1 = []
		temp2 = []
		testCounter1 = 0
		testCounter2 = 0

		for j in range (len(suppsInfo[0][i])): # for every member of a pathway in the first column of the current BPM
			try:
				testCounter1 += knockoutInfo[0].count (genenames[suppsInfo [0][i][j]]) # record the total number of tests peformed upon any gene in the pathway
			except KeyError:                                                           # i.e. count the number of genes in the pathway that were knocked out
				try:
					testCounter1 += knockoutInfo[0].count (suppsInfo [0][i][j])
				except KeyError:
					counter +=1
					misses.append (suppsInfo[0][i][j])
					pass

		for k in range (len(suppsInfo[1][i])): # for every member of a pathway in the second column of the current BPM
			try:
				testCounter2 += knockoutInfo[0].count (genenames[suppsInfo[1][i][k]]) # record the total number of tests performed upon any gene in the pathway
			except KeyError:                                                          # i.e. count the number of genes in the pathway that were knocked out
				try:
					testCounter2 += knockoutInfo[0].count (suppsInfo[1][i][k])
				except KeyError:
					counter += 1
					misses.append (suppsInfo[1][i][j])
					pass
		
		if (testCounter1 > 0 and len (suppsInfo[1][i]) > 2) or (testCounter2 > 0 and len (suppsInfo[0][i]) > 2):
		# if pathway 1 contains a gene that was tested and pathway 2 is longer than 2, or if pathway 2 contains a gene that was tested and pathway 1 is longer than 2,
		# add both pathway 1 and pathway 2 as a single BPM to the BPM list
			outputFile.write ('</FONT><FONT style=\"BACKGROUND-COLOR: yellow\"face=\"Verdana\" size=\"1\">\nBPM ')
			outputFile.write (str(i))
			outputFile.write ("(")
			outputFile.write (str(highlight))
			outputFile.write (")!! <br/>")
			if (testCounter1 > 0 and len (suppsInfo[1][i]) > 2) and (testCounter2 <= 0 or len (suppsInfo[0][i]) <= 2):
				outputFile.write ('<u>')
				outputFile.write (str (suppsInfo[0][i]))
				outputFile.write ('</u><br/>')
				outputFile.write (str (suppsInfo[1][i]))
			elif (testCounter2 > 0 and len(suppsInfo[0][i]) > 2) and (testCounter1 <= 0 or len (suppsInfo[1][i]) <= 2):
				outputFile.write (str(suppsInfo[0][i]))
				outputFile.write ('<br/><u>')
				outputFile.write (str (suppsInfo[1][i]))
				outputFile.write ('</u>')
			else:
				outputFile.write ('<u>')
				outputFile.write (str (suppsInfo[0][i]))
				outputFile.write ('<br/>')
				outputFile.write (str (suppsInfo[1][i]))
				outputFile.write ('</u>')
			highlight += 1
			outputFile.write ('</FONT><FONT face=\"Verdana\" size=\"1\"><br/> <br/><br/>')
			pathway1.append (suppsInfo[0][i]) # adding pathway 1 to the list of pathways 1
			pathway2.append (suppsInfo[1][i]) # adding pathway 2 to the list of pathways 2 (since there is a one-to-one correspondence between pathway1s and pathway
											  # 2s, these pathways will appear at the same index in the array of all BPMs)
		else:
			outputFile.write ("BPM ")
			outputFile.write (str(i))
			outputFile.write ("!! <br/>")
			outputFile.write (str (suppsInfo[0][i]))
			outputFile.write ('<br/>')
			outputFile.write (str (suppsInfo[1][i]))
			outputFile.write ('<br/> <br/> <br/>')

	outputFile.write ('</p></body>')
	
	total = []
	total.append(pathway1)
	total.append(pathway2)
	return total

#comments
# compileBPMsY is called by run in order to create an array of all BPMs that meet the minimum requirements. In this case, each BPM must have at least 1 test opposite
# a BPM of length at least 3. Once that requirement is met, both sides of the BPM are added to a BPM list, where column one represents pathway1 and column two
# represents pathway 2.  ALl the items in both columns of this list are the genenames (Y-names), not the common names.
def compileBPMsY(rawResponse, knockouts):
	
	suppsInfo = readBPMs(rawResponse)
			
	total = []
	counter = 0
	misses = []
	collection = collect_bpms_that_meet_criteria (suppsInfo, knockouts)
	total.append (collection[0]) #all the first pathways in the appropriate BPMs
	total.append (collection[1]) #all the second pathways in the appropriate BPMs
	print 'BPMs that meet criteria: ', len (collection[0])

	if rawResponse == "6":

		outputValidBpms()

	return total # return the list of BPMs


#comments
# compileBPMsCommon is a method called by run after run called compileBPMsY. The method takes in a list called suppsInfo,
# which is the same list that compileBPMsY returned. The entire purpose of compileBPMsCommon is to create an exact
# copy of suppsInfo, except with the common names instead of the genenames (Y-names)
def compileBPMsCommon (suppsInfo):
	
	total = []
	genenames = []
	genenames = readRegistryGenenames()
	pathway3 = []
	pathway4 = []
	
	for n in range(len(suppsInfo[0])): # for every BPM in suppsInfo...

		temp3 = []
		temp4 = []

		for o in range (len(suppsInfo[0][n])): # for every gene in pathway 1 of each BPM
			try:
				temp3.append (genenames[suppsInfo[0][n][o]]) # append the common name to the pathway 1 list of common names
			except KeyError:
				temp3.append ('NULL') 

		for p in range (len(suppsInfo[1][n])): # for every gene in pathway 2 of each BPM
			try:
				temp4.append (genenames[suppsInfo[1][n][p]]) # append the common name to the pathway 2 list of common names
			except KeyError:
				temp4.append ('NULL')
		pathway3.append (temp3)
		pathway4.append (temp4)

	total.append (pathway3)
	total.append (pathway4)
	return total # return the total list of BPMs denoted by their common names

#comments
# compileYTestLocations takes in a list named suppsInfo, which is the exact list returned by compileBPMsY when called by
# run. compileYTestLocations then iterates through every BPM, and finds the test locations for each gene in each BPM.
# Since some of the names of the tests are denoted by common names and some are noted by genename, this step is necessary
# for easy access to the tests later on. A -1 is put into the list for each gene that doesn't have a test, either because
# the name of the test is denoted with a common name, or the gene simply doesn't have a test.
def compileYTestLocations (suppsInfo, knockoutInfo):
	
	pathway5 = []
	pathway6 = []
	total = []
	indices = knockoutInfo[4]
	
	for i in range (len(suppsInfo[0])): #for each BPM
		temp1 = []
		temp2 = []
		for l in range (len(suppsInfo[0][i])): # for each item in pathway 1 of the current BPM
			try:
				temp1.append (indices[knockoutInfo[0].index (suppsInfo[0][i][l])]) # finds the index of a test in the knockoutInfo
			except ValueError:                                            # and appends it to the list, unless there is no
				temp1.append (-1)                                         # test, in which case -1 is appended
		for m in range (len(suppsInfo[1][i])): # for each item in pathway 2 of the current BPM
			try:
				 temp2.append (indices[knockoutInfo[0].index (suppsInfo[1][i][m])]) # finds the index of a test in the knockoutInfo
			except ValueError:                                             # and appends it to the list, unless there is
				 temp2.append ( -1)                                        # no test, in which case -1 is appended

		pathway5.append (temp1)
		pathway6.append (temp2)

	total.append (pathway5)
	total.append (pathway6)

	return total # return the combined lists

def compileCommonTestLocations (suppsInfo, knockoutInfo):
	
	total = []
	genenames = []
	genenames = readRegistryGenenames()
	pathway6 = []
	pathway7 = []
	indices = knockoutInfo[4]

	for n in range(len(suppsInfo[0])): # for each BPM

		temp6 = []
		temp7 = []

		for o in range (len(suppsInfo[0][n])): # for each item in pathway 1 of the current BPM
			try:
				tempString = genenames[suppsInfo[0][n][o]] # finds the common name of the current gene, because suppsInfo
			except KeyError:                               # came directly from compileBPMsY, and this method is looking
				tempString = 'Absolutely not there'        # for tests denoted by common names
			try:
				temp6.append (indices[knockoutInfo[0].index (tempString)]) # finds the index of a test in the knockoutInfo
			except ValueError:                              # and appends it to the list, unless there is no test, in which
				temp6.append (-1)                           # case -1 is appended

		for p in range (len(suppsInfo[1][n])):
			try:
				tempString2 = genenames[suppsInfo[1][n][p]] # finds the common name of the current gene, because suppsInfo
			except KeyError:                                # came directly from compileBPMsY, and this method is looking
				tempString2 = 'Absolutely not there'        # for tests denoted by common names
			try:
				temp7.append (indices[knockoutInfo[0].index (tempString2)]) # finds hte index of a test in the knockoutInfo and
			except ValueError:                                     # appends it to the list, unless there is not test, in
				temp7.append (-1)                                  # which case -1 is appended

		pathway6.append (temp6)
		pathway7.append (temp7)
		
	total.append (pathway6)
	total.append (pathway7)

	return total # return the combined lists

# Ask the user which process with BPMs s/he would like to perform
def determine_file_path ():
	
	print '----------------BPM Validation Test----------------\n'
	print 'Please enter the file path for the BPMs\n\n'

	response = raw_input("")
	response = 'BPM Networks/' + response

	return response

# compile a 3D array of BPMs and their locations with their Y and common genenames
def create_bpm_list(response, knock):	
	
	temp1 = compileBPMsY (response, knock) # compile the list of all BPMs, and have them all be represented with their genenames
	temp2 = compileBPMsCommon (temp1) # compile the list of all BPMs represented by their common names from the list of
									  # BPMs denoted with their genenames
	temp3 = compileYTestLocations (temp1, knock) # find the test locations for all BPMs denoted by their genenames
	temp4 = compileCommonTestLocations (temp1, knock) # find the test locations for all BPMs denoted by their common names
	bpmList = []
	bpmList.append (temp1[0]) 
	bpmList.append (temp1[1])
	bpmList.append (temp2[0])
	bpmList.append (temp2[1])
	bpmList.append (temp3[0])
	bpmList.append (temp3[1])
	bpmList.append (temp4[0])
	bpmList.append (temp4[1])
	
	return bpmList

# using the responses from the user, compile the output file strings for use in run()
def get_output_file_strings(gseaType):
	
	outputFileStrings = []

	pFileString = 'Random Geneset P Values.txt'
	pFileKOString = 'Random KO P Values.txt'
	outputFileString = 'Output.html'
	prunedPFileString = 'Pruned Random Geneset P Values.txt'
	prunedPFileKOString = 'Pruned Random KO P Values.txt'
	outputFileStrings.append(pFileString)
	outputFileStrings.append(outputFileString)
	outputFileStrings.append(prunedPFileString)
	outputFileStrings.append(pFileKOString)
	outputFileStrings.append(prunedPFileKOString)
		
	return outputFileStrings

def write_outputfile_header(outputFile, pruneCut, response):
	
	response = response.strip('BPM Networks/')
	response = response.rstrip('.tx')
	outputFile.write ('<head><title><body><h2 alignment: center> BPMs ')
	outputFile.write ('</h2><br/><h3>' + 'Prune Cut-Off: ' + pruneCut)
	outputFile.write ('</h3><br/><br/>\n')

def write_current_bpm_header(currentBPM, outputFile):
	
	outputFile.write ('<p><FONT face=\"Verdana\" size=\"1\">Current BPM ')
	outputFile.write(str(currentBPM))
	outputFile.write ("<br/>")

def initialize_genelist(bpmList, j, side, currentBPM):
	
	genelist = []
	for i in range (len(bpmList[side][currentBPM])):
		genelist.append (bpmList[side][currentBPM][i])
	genelist.remove (bpmList[side][currentBPM][j])
	return genelist

def get_log_sorted_values(temp0, gseaType):
	
	if gseaType == "1" or gseaType == "2":
		sorted = getSortedMicroArray (temp0) # create a sorted array of the log10 values from the hughes column
	else:
		sorted = getSortedMicroArrayAbs (temp0) # create a sorted array of the log10 values from the hughes column taking the absolute values of the log10s
	return sorted
	

def get_gene_ranks_KO(sorted, bpmList, currentBPM, gseaType, side):
	
	if gseaType == "1" or gseaType == "2":
		logs1 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the KO pathway
	else:
		logs1 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the KO pathway
	return logs1

def get_gene_ranks_compensatory(sorted, bpmList, currentBPM, gseaType, side):
	
	if gseaType == "1" or gseaType == "2":
		logs2 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the current pathway
	else:
		logs2 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the current pathway
	return logs2

def set_genenames_KO(logs1):
	
	names1 = []
	for name in logs1:
		names1.append (name[3])
	return names1

def set_genenames_compensatory(logs2):
	
	names2 = []
	for name2 in logs2:
		names2.append (name2[3])
	return names2

def write_current_knockout_stats(bpmList, currentBPM, j, outputFile, sorted, side, otherside):
	
	outputFile.write('-----Knockout Gene:') # print the gene knocked out, both its genename and common name
	outputFile.write(str(bpmList[side][currentBPM][j])) # the knockout's genename
	outputFile.write(' (')
	outputFile.write(str(bpmList[side + 2][currentBPM][j])) # the knockout's common name
	outputFile.write (')')
	outputFile.write('-----<br/><br/>The length of the pathway opposite the knockout: ')
	outputFile.write (str(len(bpmList[otherside][currentBPM]))) # print the length of the pathway opposite the knockout
	outputFile.write ("<br/><br/>The length of the list of log10 values: ")
	outputFile.write (str(len(sorted))) # the length of the list of all log10 values, adjusted depending on the number of NaNs
	outputFile.write ("<br/><br/>")
	outputFile.write("<pre>Pathway 1\t\t\tlog10\t\t# After\t\t# Before\t\t\t\tPathway 2\t\t\tlog10\t\t# After\t\t# Before<br/><br/>\n")

def determine_shorter_pathway(bpmList, currentBPM):
	
	if (len (bpmList[0][currentBPM]) < len (bpmList[1][currentBPM])): # determine which list is shorter
			shorter = 0
	else:
			shorter = 1
	return shorter

def determine_longer_pathway(currentBPM, bpmList):
	
	if (len (bpmList[0][currentBPM]) < len (bpmList[1][currentBPM])): # determine which list is shorter
			longer = 1
	else:
			longer = 0
	return longer

def write_item_KO(bpmList, currentBPM, j, item, outputFile, a, side):
	
#CHANGE FROM =>	if (item == bpmList[side][currentBPM][a] or item == bpmList[side][currentBPM][j]):
	if (bpmList[side][currentBPM][a] == bpmList[side][currentBPM][j]):
	# if the current item in the pathway 1 is the knockout, underline it (item is always the genename)
			outputFile.write ('<u>')
			outputFile.write (item)
			outputFile.write ('</u>')
	else: # otherwise just write the item (item is always the genename)
			outputFile.write (item)

def write_item_compensatory(bpmList, currentBPM, a, item, outputFile, side, good):
	
	if (good == "yes"):
		outputFile.write("<span style='background:yellow;'>")
	outputFile.write (bpmList[side][currentBPM][a]) # print the opposite pathway's (pathway 2) ath gene's genename
	outputFile.write ('(')
	outputFile.write (bpmList[side + 2][currentBPM][a]) # print the opposite pathway's (pathway 2) ath gene's common name
	outputFile.write (')')
	
def write_items_common_name(bpmList, currentBPM, a, j, outputFile, side):
	
	outputFile.write ('(')
	if (bpmList[side][currentBPM][a] == bpmList[side][currentBPM][j]):
	# if the ath gene in the shorter pathway is the knockout, print the common name of the
	# knocked out gene and underline it
			outputFile.write ('<u>')
			outputFile.write (bpmList[side][currentBPM][a])
			outputFile.write ('</u>')
	else: # otherwise just output the common name for the ath gene in the shorter pathway
			outputFile.write (bpmList[side][currentBPM][a])

	outputFile.write(')')

def write_log_and_rank(temp0, x, currentBPM, bpmList, outputFile, logs1, names1, a, side):
	
	if ((bpmList[side][currentBPM][a] == temp0[x][0]) or  (bpmList[side+2][currentBPM][a] == temp0[x][0])):
	# if the genename/common name of the ath gene in the pathway (pathway 1) is equal to the name
	# stored in the zth list in temp 0[z][0] (temp0[z][0] holds the name of the gene whose information
	# is stored in temp0[z][1], temp0[z][2], and temp0[z][3] for the current list z)
		outputFile.write ('\t\t\t')
		outputFile.write (temp0[x][2] + '\t\t')
		# write the information stored in the third column (log10 value) for the current gene
		try: # try to print the rank for the curent gene's log10 value among all the log10 values calculated and the # of log10 values after the gene
			 # for the current knockout (this calculation is done in generanksAbs previously).  
			outputFile.write (str(logs1[names1.index (temp0[x][0])][0]) + '\t\t' + str(logs1[names1.index (temp0[x][0])][1]))
			return logs1[names1.index (temp0[x][0])][0]
		except ValueError:
			outputFile.write ("N/A\t\tN/A")
			return -0.001
	else:
		return 0

def get_rank(temp0, x, currentBPM, bpmList, outputFile, logs1, names1, a, side):

	if ((bpmList[side][currentBPM][a] == temp0[x][0]) or  (bpmList[side+2][currentBPM][a] == temp0[x][0])):
		try:
			return logs1[names1.index (temp0[x][0])][0]
		except ValueError:
			return 0
	else:
		return 0

		
def write_scores(outputFile, pFile, pFileKO, CRS_left, CRS_right, random_genesets_pValLeft, random_genesets_pVal, random_knockouts_pValLeft, random_knockouts_pVal, currentBPM):
	
	outputFile.write ("<br/><br/>ES Random Genesets = ")
	outputFile.write (str(CRS_left)) #output the value calculated by the compute_cluster_rank_score method for the KO pathway
	outputFile.write ("\t\tES Random KOs = ") 
	outputFile.write (str(CRS_left)) #output the value calculated by the compute_cluster_rank_score method for the KO pathway
	outputFile.write ("\t\tES Random Genesets = ")
	outputFile.write (str(CRS_right)) #output the value calculated by the compute_cluster_rank_score method for the opposite pathway
	outputFile.write ("\t\t\tES Random KOs = ") 
	outputFile.write (str(CRS_right)) #output the value calculated by the compute_cluster_rank_score method for the opposite pathway
	outputFile.write ("<br/><br/>p Random Genesets = ")
	outputFile.write (str(random_genesets_pValLeft)) # output the p value calculated by the get_random_geneset_data method for the KO pathway (see above)
	outputFile.write ("\t\t\t\tp Random KOs = ")
	outputFile.write (str(random_knockouts_pValLeft)) # output the p value calculated by the get_random_knockout_data method for the current pathway (see above)
	outputFile.write ("\t\t\t\tp Random Genesets = ")
	outputFile.write (str(random_genesets_pVal)) # output the p value calculated by the get_random_geneset_data method for the KO pathway (see above)
	outputFile.write ("\t\t\t\tp Random KOs = ")
	outputFile.write (str(random_knockouts_pVal)) # output the p value calculated by the get_random_knockout_data method for the current pathway (see above)
	pFile.write (str(random_genesets_pVal)) # output the p value to a separate file for easy histogram creation later
	pFile.write ('\t')
	pFile.write (str (currentBPM)) #output the BPM of the p value to the same file as the get_random_geneset_datas, so finding validated BPMs can be done easily
	pFile.write ('\t1\n')
	pFileKO.write(str(random_knockouts_pVal) + '\t' + str(currentBPM) + '\t1\n')
	
def write_pruned(pruned, pruned_common, temp, outputFile, prunedPFile, prunedPFileKO, i, knockout_pathway_common, knockout_pathway_Y, expression_data, knockout_genes_list):
	if (len(pruned) > 2):
		CRS_right = compute_rank_cluster_score(temp, pruned)
		random_genesets_CRS_scores = get_random_geneset_data(temp, pruned)
		random_knockouts_CRS_scores = get_random_knockout_data(expression_data, knockout_pathway_common, knockout_pathway_Y, pruned_common, pruned, knockout_genes_list)
		random_genesets_pVal = calculate_p_value(CRS_right, random_genesets_CRS_scores)
		random_knockouts_pVal = calculate_p_value(CRS_right, random_knockouts_CRS_scores)
		prunedPFile.write(str(random_genesets_pVal) + '\t' + str(i) + '\t1\n')
		prunedPFileKO.write(str(random_knockouts_pVal) + '\t' + str(i) + '\t1\n')
	else:
		CRS_right = 'N/A'
		random_genesets_pVal = 'N/A'
		random_knockouts_pVal = 'N/A'
	
	outputFile.write('\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tES Pruned Random Genesets =' + str(CRS_right) + 'p Pruned Random Genesets = ' + str(random_genesets_pVal) + '\n')
	outputFile.write('\t\t\t\t\t\t\t\t\t\t\t\t\tES Pruned Random KOs =' + str(CRS_right) + '\tp Pruned Random KOs = ' + str(random_knockouts_pVal))
	outputFile.write ("<br/><br/>")
	outputFile.write("<hr/><br/></pre><br/>\n")
		
	
def KO_output(genelist, bpmList, j, currentBPM, temp, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, pruneCut, expression_data, knockout_genes_list):
	
	CRS_right = compute_rank_cluster_score(temp, bpmList[1][currentBPM])
	CRS_left = compute_rank_cluster_score(temp, genelist)
		
	random_genesets_CRS_scores = get_random_geneset_data(temp, bpmList[1][currentBPM]) # calculate the get_random_geneset_data from the gsea value of the current pathway
	random_knockouts_CRS_scores = get_random_knockout_data(expression_data, bpmList[2][currentBPM], bpmList[0][currentBPM], bpmList[3][currentBPM], bpmList[1][currentBPM], knockout_genes_list)

	random_genesets_pVal = calculate_p_value(CRS_right, random_genesets_CRS_scores)
	random_knockouts_pVal = calculate_p_value(CRS_right, random_knockouts_CRS_scores)
	
	random_genesets_CRS_scoresLeft = get_random_geneset_data(temp, genelist)
	random_knockouts_CRS_scoresLeft = get_random_knockout_data(expression_data, bpmList[2][currentBPM], bpmList[0][currentBPM], bpmList[2][currentBPM], bpmList[0][currentBPM], knockout_genes_list)

	random_genesets_pValLeft = calculate_p_value(CRS_left, random_genesets_CRS_scoresLeft)
	random_knockouts_pValLeft = calculate_p_value(CRS_left, random_knockouts_CRS_scoresLeft)
			
	sorted = get_log_sorted_values(temp, gseaType)
	logs1 = get_gene_ranks_KO(sorted, bpmList, currentBPM, gseaType, 0)
	logs2 = get_gene_ranks_compensatory(sorted, bpmList, currentBPM, gseaType, 1)

	names1 = set_genenames_KO(logs1) #names1 and names2 will be used to store the names of the genes in logs1 and logs2, stored with the same indices as their corresponding information in logs1 and logs2,
	names2 = set_genenames_compensatory(logs2) #so names1 and names2 effectively allow easy access to logs1 and logs2
	
	write_current_knockout_stats(bpmList, currentBPM, j, outputFile, sorted, 0, 1)
	
	shorter = determine_shorter_pathway(bpmList, currentBPM)
	longer = determine_longer_pathway(currentBPM, bpmList)
	pruned = []
	pruned_common = []

	a = 0 # a counter that holds the number of iterations through the pathways that have occurred
	
	for item in bpmList[shorter][currentBPM]: # for every gene in the shorter pathway of the current BPM
			found = 0 # a boolean that tells whether or not the gene's log10 data has been found in temp during write_log_and_ranks
			good = "not yet"
			if shorter == 0: # always print the side with the knockout on the left, so if the side with the
							 # knockout is shorter, then output current item
					write_item_KO(bpmList, currentBPM, j, item, outputFile, a, 0) #BIG CHANGE => both a's used to be j's
					write_items_common_name(bpmList, currentBPM, a, j, outputFile, 2)
# CHANGE =>					outputFile.write ('\t\t\t\t\t')
					
					found = 0
					for x in range (len (temp)): # for the size of the hughes column
						if (found != 0):
							x = len(temp) + 1
						else:
							found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 0)
					
					found = 0
					outputFile.write ('\t\t\t\t\t')
					for z in range (len (temp)): # for the size of the hughes column
						rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)
						if rank != 0:
							break
					
					if (float(float(rank)/float(len(temp))) > pruneCut):
						good = "yes"
						pruned.append(bpmList[longer][currentBPM][a])
						pruned_common.append(bpmList[longer + 2][currentBPM][a])
					else:
						good = "no"
					
					write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 1, good)

					write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)
					
					if good == "yes":
						outputFile.write("</span>")

					outputFile.write ('<br/>')
					found = 0
					a+=1 # for each iteration, increment a
					
			else: # if the shorter list isn't 0, but is in fact 1
					write_item_KO(bpmList, currentBPM, j, bpmList[longer][currentBPM][a], outputFile, a, 0)
					write_items_common_name(bpmList, currentBPM, a, j, outputFile, 2)
					found = 0
					
					for x in range (len (temp)): # for the size of the hughes column
						if (found != 0):
							x = len(temp) + 1
						else:
							found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 0)

					outputFile.write ('\t\t\t\t\t')
					found = 0
					for z in range (len (temp)): # for the size of the hughes column
						rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)
						if rank != 0:
							break
					
					if (float(float(rank)/float(len(temp))) > pruneCut):
						good = "yes"
						pruned.append(bpmList[shorter][currentBPM][a])
						pruned_common.append(bpmList[shorter + 2][currentBPM][a])
					else:
						good = "no"
					
					write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 1, good)

					found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)

					if good == "yes":
						outputFile.write("</span>")
                    
					outputFile.write ('<br/>')
					found = 0
					a+=1 # increment a

	# this while loop performs the same operations as above for the longer list, because the above
	# loop finished the smaller list
	while a < len(bpmList[longer][currentBPM]):
			found = 0
			good = "not yet"
			if longer == 1: # if longer is 1, then it doesn't contain the knockout, so is on the right,
							# so it must be moved to align with the pathway 2 information that was written above
					outputFile.write ('\t\t\t\t\t\t\t\t\t\t\t\t\t')
					found = 0
					for z in range (len (temp)): # for the size of the hughes column
						rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1)
						if rank != 0:
							break

					if (float(float(rank)/float(len(temp))) > pruneCut):
						good = "yes"
						pruned.append(bpmList[longer][currentBPM][a])
						pruned_common.append(bpmList[longer + 2][currentBPM][a])
					else:
						good = "no"
                    
					write_item_compensatory(bpmList, currentBPM, a, bpmList[longer][currentBPM][a], outputFile, 1, good) #BIG CHANGE => changed item to bpmList[longer][currentBPM][a]
				
					for z in range (len (temp)): # for the size of the hughes column
						if (found != 0):
							z = len(temp) + 1
						else:
							found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1) #CHANGE => 1 to 2

					if good == "yes":
						outputFile.write("</span>")
					
					found = 0
					outputFile.write ('<br/>')
					a+=1 # increment a
					
			else: # if longer is shorter, and therefore contains the knockout
					write_item_KO(bpmList, currentBPM, j, bpmList[longer][currentBPM][a], outputFile, a, 0)
					write_items_common_name(bpmList, currentBPM, a, j, outputFile, 2)
					found = 0
					
					for x in range (len (temp)): # for the size of the hughes column
						if (found != 0):
							x = len(temp) + 1
						else:
							found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 0)
											      
					found = 0
					outputFile.write ('<br/>')
					a+=1
	
	write_scores(outputFile, pFile, pFileKO, CRS_left, CRS_right, random_genesets_pValLeft, random_genesets_pVal, random_knockouts_pValLeft, random_knockouts_pVal, currentBPM)
	write_pruned(pruned, pruned_common, temp, outputFile, prunedPFile, prunedPFileKO, currentBPM, bpmList[2][currentBPM], bpmList[0][currentBPM], expression_data, knockout_genes_list)
	
def compensatory_output(genelist, bpmList, k, currentBPM, temp, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, pruneCut, expression_data, knockout_genes_list):	
	
	CRS_right = compute_rank_cluster_score(temp, bpmList[0][currentBPM])
	CRS_left = compute_rank_cluster_score(temp, genelist)
		
	random_genesets_CRS_scores = get_random_geneset_data(temp, bpmList[0][currentBPM]) # calculate the get_random_geneset_data from the gsea value of the current pathway
	random_knockouts_CRS_scores = get_random_knockout_data(expression_data, bpmList[3][currentBPM], bpmList[1][currentBPM], bpmList[2][currentBPM], bpmList[0][currentBPM], knockout_genes_list)

	random_genesets_pVal = calculate_p_value(CRS_right, random_genesets_CRS_scores)
	random_knockouts_pVal = calculate_p_value(CRS_right, random_knockouts_CRS_scores)
	
	random_genesets_CRS_scoresLeft = get_random_geneset_data(temp, genelist)
	random_knockouts_CRS_scoresLeft = get_random_knockout_data(expression_data, bpmList[3][currentBPM], bpmList[1][currentBPM], bpmList[3][currentBPM], bpmList[1][currentBPM], knockout_genes_list)

	random_genesets_pValLeft = calculate_p_value(CRS_left, random_genesets_CRS_scores)
	random_knockouts_pValLeft = calculate_p_value(CRS_left, random_knockouts_CRS_scores)

    
	sorted = get_log_sorted_values(temp, gseaType)
	logs1 = get_gene_ranks_KO(sorted, bpmList, currentBPM, gseaType, 1)
	logs2 = get_gene_ranks_compensatory(sorted, bpmList, currentBPM, gseaType, 0)
    
	names1 = set_genenames_KO(logs1) #names1 and names2 will be used to store the names of the genes in logs1 and logs2, stored with the same indices as their corresponding information in logs1 and logs2,
	names2 = set_genenames_compensatory(logs2) #so names1 and names2 effectively allow easy access to logs1 and logs2
    
	write_current_knockout_stats(bpmList, currentBPM, k, outputFile, sorted, 1, 0)
	shorter = determine_shorter_pathway(bpmList, currentBPM)
	longer = determine_longer_pathway(currentBPM, bpmList)
	pruned = []
	pruned_common = []
	a = 0

	for item in bpmList[shorter][currentBPM]:
			found = 0
			good = "not yet"
			if shorter == 1:
					write_item_KO(bpmList, currentBPM, k, bpmList[1][currentBPM][a], outputFile, a, 1) #BIG CHANGE => a used to be k, bpmList[1][currentBPM][a] used to be item
					write_items_common_name(bpmList, currentBPM, a, k, outputFile, 3)
					found = 0
					
					for x in range (len (temp)):
						if (found == 1):
							x = len(temp) + 1
						else:
							found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 1)
    
					found = 0
					outputFile.write ('\t\t\t\t\t')
					for z in range (len (temp)): # for the size of the hughes column
						rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
						if rank != 0:
							break

					if (float(float(rank)/float(len(temp))) > pruneCut):
						good = "yes"
						pruned.append(bpmList[longer][currentBPM][a])
						pruned_common.append(bpmList[longer + 2][currentBPM][a])
					else:
						good = "no"

					write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 0, good)
    
					write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
    
					if good == "yes":
						outputFile.write("</span>")
					
					found = 0
					outputFile.write ('<br/>')
					a+=1
    
			else:
					write_item_KO(bpmList, currentBPM, k, bpmList[1][currentBPM][a], outputFile, a, 1)#BIG CHANGE => a used to be k
					write_items_common_name(bpmList, currentBPM, a, k, outputFile, 3)
					found = 0
					
					for x in range (len (temp)):
						if (found == 1):
							x = len(temp) + 1
						else:
							write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 1)
    
					found = 0
					outputFile.write ('\t\t\t\t\t')
					for z in range (len (temp)): # for the size of the hughes column
						rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
						if rank != 0:
							break

					if (float(float(rank)/float(len(temp))) > pruneCut):
						good = "yes"
						pruned.append(bpmList[shorter][currentBPM][a])
						pruned_common.append(bpmList[shorter + 2][currentBPM][a])
					else:
						good = "no"

					write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 0, good)
    
					found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)

					if good == "yes":
						outputFile.write("</span>")
                    
					found = 0
					outputFile.write ('<br/>')
					a+=1
    
	for a in range (a, len(bpmList[longer][currentBPM])): #BIG CHANGE => changed from range(len(bpmList[longer][currentBPM]))
    
			found = 0
			good = "not yet"
			if longer == 0:
    
				outputFile.write ('\t\t\t\t\t\t\t\t\t\t\t\t\t')
				for z in range (len (temp)): # for the size of the hughes column
					rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
					if rank != 0:
						break

				if (float(float(rank)/float(len(temp))) > pruneCut):
					good = "yes"
					pruned.append(bpmList[longer][currentBPM][a])
					pruned_common.append(bpmList[longer + 2][currentBPM][a])
				else:
					good = "no"
				
				write_item_compensatory(bpmList, currentBPM, a, bpmList[longer][currentBPM][a], outputFile, 0, good) #BIG CHANGE => bpmList[longer][currentBPM][a] used to be item
				found = 0    
				
				found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0)
                
				if good == "yes":
					outputFile.write("</span>")
                
				found = 0
				outputFile.write ('<br/>')
				a+=1

			else:
					write_item_KO(bpmList, currentBPM, k, bpmList[longer][currentBPM][a], outputFile, a, 1) #BIG CHANGE => a used to be k
					write_items_common_name(bpmList, currentBPM, a, k, outputFile, 3)
					found = 0
					
					for x in range (len (temp)):
						if (found == 1):
							x = len(temp) + 1
						else:
							found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 1)

					found = 0
					outputFile.write ('<br/>')
					a+=1
  
	write_scores(outputFile, pFile, pFileKO, CRS_left, CRS_right, random_genesets_pValLeft, random_genesets_pVal, random_knockouts_pValLeft, random_knockouts_pVal, currentBPM)
	write_pruned(pruned, pruned_common, temp, outputFile, prunedPFile, prunedPFileKO, currentBPM, bpmList[3][currentBPM], bpmList[1][currentBPM], expression_data, knockout_genes_list)

def run():


	response = determine_file_path()
	expression_file_string = raw_input('\nPlease enter the file path to the expression data you would like to use.\n')
	if (expression_file_string == "hughes"):
		expression_file_string = "./Expression Data/data_expts1-300_meas.txt"
	expression_data = open(expression_file_string, 'r')
	gseaType = '3'
	pruneCut = raw_input("\nWhat would you like to set the pruning cutoff to?\n")
	file_strings = get_output_file_strings(gseaType)
	
	knock = getKnockouts(expression_data)
	knockout_genes_list = knock[3]
	bpmList = create_bpm_list(response, knock)
	# the bpmList contains all the information gathered above in one list, where columns [0] & [1] contain
	# the genenames of pathways 1 and 2 for all BPMs, columns [2] & [3] contain the common names of pathways
	# 1 and 2 for all BPMs, columns [4] & [5] contain the test locations of all the genes in pathways 1 and
	# 2 of each BPM that are denoted by their genename, and columns [6] & [7] contain the test locations of
	# all the genes in pathways 1 and 2 of each BPM that are denoted by their common name
	ourList = ourGeneList(bpmList)
	theirList = theirListLocations(ourList, knock[0])
	
	pFile = open(file_strings[0], 'w')
	outputFile = open(file_strings[1], 'w')
	prunedPFile = open (file_strings[2], 'w')
	pFileKO = open(file_strings[3], 'w')
	prunedPFileKO = open(file_strings[4], 'w')


	write_outputfile_header(outputFile, pruneCut, response)
	
	print 'Beginning...'
	for i in range(0, len(bpmList[0])): # for every BPM
		
		print
		print '\tBPM ', str(i), '...'
		print
		currentBPM = i
		write_current_bpm_header(currentBPM, outputFile)
		
		for j in range(0, len(bpmList[0][currentBPM])): # for every gene in pathway 1 of the current BPM
			print '\t\tLooking at gene ', bpmList[0][currentBPM][j], ' for possible KO output'
			if ((bpmList[4][currentBPM][j] != -1) and (len(bpmList[1][currentBPM]) > 2)):
			# if there is a test for the current gene, and the length of the pathway on the opposite side is greater than 2
				genelist = initialize_genelist(bpmList, j, 0, currentBPM)
				temp0 = getHughesColumn(expression_data, bpmList[4][currentBPM][j]) # get the corresponding column from the hughes file
				print '\t\tUsing gene ', bpmList[0][currentBPM][j], ' as the KO'
				KO_output(genelist, bpmList, j, currentBPM, temp0, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list)

				
			elif ((bpmList[6][currentBPM][j] != -1) and (len(bpmList[1][currentBPM]) > 2)):
				# if  there was no test denoted by a genename, check if there is a test denoted by a common name for the
				# current gene in pathway 1 of the current BPM, and make sure the opposite pathway has length > 2

				genelist = initialize_genelist(bpmList, j, 0, currentBPM)
				temp1 = getHughesColumn(expression_data, bpmList[6][currentBPM][j])
				print '\t\tUsing gene ', bpmList[0][currentBPM][j], ' as the KO'
				KO_output(genelist, bpmList, j, currentBPM, temp1, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list)
			
		
		print '\t\tFinished First Pathway...'
		print
		for k in range(0, len(bpmList[1][currentBPM])): # for every gene in pathway 2 in the current BPM
			print '\t\tLooking at gene ', bpmList[1][currentBPM][k], ' for possible compensatory output'
			if  ((bpmList[5][currentBPM][k] != -1) and (len(bpmList[0][currentBPM]) > 2)):
			# if there is a test denoted by genename for the current gene and the opposite pathway has length > 2
														  
				genelist = initialize_genelist(bpmList, k, 1, currentBPM)
				temp2 = getHughesColumn(expression_data, bpmList[5][currentBPM][k])
				print '\t\tUsing gene ', bpmList[1][currentBPM][k], ' as the KO'
				compensatory_output(genelist, bpmList, k, currentBPM, temp2, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list)

			elif ((bpmList[7][currentBPM][k] != -1) and (len(bpmList[0][currentBPM]) > 2)):
			# if the wasn't a test denoted by genename for the current gene but there is one denoted by common name
			# and the length of the opposite pathway is greater than 2

				genelist = initialize_genelist(bpmList, k, 1, currentBPM)
				temp3 = getHughesColumn(expression_data, bpmList[7][currentBPM][k])
				print '\t\tUsing gene ', bpmList[1][currentBPM][k], ' as the KO'
				compensatory_output(genelist, bpmList, k, currentBPM, temp3, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list)
		
		print '\t\tFinished Second Pathway...'

		outputFile.write ('</p>')    
				
	outputFile.close()
										
				
	print '********************************************'        
		
run()